Data provider country statistics
New OBIS datasets since 2021
Load data
readRenviron("env.txt")
con <- dbConnect(RPostgres::Postgres(), dbname = Sys.getenv("OBIS_DB"), host = Sys.getenv("OBIS_HOST"), port = Sys.getenv("OBIS_PORT"), user = Sys.getenv("OBIS_USER"), password = Sys.getenv("OBIS_PASSWORD"), bigint = "integer")
res <- dbSendQuery(con, glue("
select
datasets.id,
datasets.url,
(datasets.statistics->>'Occurrence')::int as records,
extract('year' from datasets.created) as year,
persons.organization,
persons.country as person_country,
ih.country as inst_country,
persons.oceanexpert_institution_id
from datasets
left join metadata.lnk_dataset_person ldp on ldp.dataset_id = datasets.id
left join metadata.persons on ldp.person_id = persons.id
left join metadata.institution_hierarchy ih on ih.id = persons.oceanexpert_institution_id
order by (datasets.statistics->>'Occurrence')::int desc nulls last, datasets.id
"))
datasets <- dbFetch(res) %>%
mutate(inst_country = stringr::str_replace(inst_country, "Réunion", "France")) %>%
mutate(inst_country = stringr::str_replace(inst_country, "Wallis Futuna Islands", "Wallis and Futuna Islands")) %>%
distinct()Cleanup country names
country_codes <- countrycode::codelist %>% select(name = un.name.en, iso2c) %>%
filter(!is.na(iso2c))
datasets <- datasets %>%
left_join(country_codes, by = c("person_country" = "iso2c")) %>%
rowwise() %>%
mutate(country = ifelse(!is.na(inst_country), inst_country, name)) %>%
mutate(across(where(is.character), ~na_if(., ""))) %>%
filter(!is.na(country))
head(datasets)## # A tibble: 6 × 10
## # Rowwise:
## id url records year organization person_country inst_country
## <chr> <chr> <int> <dbl> <chr> <chr> <chr>
## 1 46389886-a09c-4c… http… 8331913 2023 Flanders Ma… BE Belgium
## 2 efa02fe9-6b5b-41… http… 4583637 2020 CSIRO Natio… AU Australia
## 3 efa02fe9-6b5b-41… http… 4583637 2020 CSIRO Ocean… AU Australia
## 4 80479e14-2730-43… http… 3134870 2020 National Oc… <NA> United Stat…
## 5 80479e14-2730-43… http… 3134870 2020 U.S. Geolog… <NA> United Stat…
## 6 8b0d5fdd-6a3f-48… http… 3071809 2020 School of E… AU Australia
## # ℹ 3 more variables: oceanexpert_institution_id <int>, name <chr>,
## # country <chr>
Add region info
countries <- m49::m49_full %>% mutate(country_or_area = stringr::str_replace(country_or_area, "Turkey", "Türkiye")) %>% select(country_or_area, ldc, lldc, sids, developed, developing, region_name) %>%
mutate(classification = factor(ifelse(sids, "SIDS", ifelse(region_name == "Africa", "Africa", "other")), levels = c("SIDS", "Africa", "other")))
datasets <- datasets %>%
left_join(countries, by = c("country" = "country_or_area"))Statistics
recent_datasets <- datasets %>%
filter(year >= start_year & year <= end_year)
recent_stats <- recent_datasets %>%
distinct(id, records, year, country, sids, region_name, classification) %>%
filter(!is.na(country)) %>%
group_by(country, year, sids, region_name, classification) %>%
summarize(datasets = n(), records = sum(records)) %>%
arrange(desc(records))
head(recent_stats)## # A tibble: 6 × 7
## # Groups: country, year, sids, region_name [6]
## country year sids region_name classification datasets records
## <chr> <dbl> <lgl> <chr> <fct> <int> <int>
## 1 Belgium 2023 FALSE Europe other 26 1.33e7
## 2 United States of Amer… 2021 FALSE Americas other 162 6.75e6
## 3 Canada 2021 FALSE Americas other 45 6.58e6
## 4 Belgium 2022 FALSE Europe other 60 5.18e6
## 5 Denmark 2023 FALSE Europe other 9 4.58e6
## 6 Netherlands 2022 FALSE Europe other 16 4.03e6
write.csv(recent_stats, file = "output/stats.csv", row.names = FALSE)
recent_stats %>%
relocate(country, year, records) %>% rmarkdown::paged_table(options = list(rows.print = 100))Visualize
stats_all_years <- recent_stats %>%
group_by(country) %>%
summarize(records = sum(records))
recent_stats <- recent_stats %>%
mutate(country = factor(country, levels = stats_all_years$country[order(stats_all_years$records)]))
ggplot(data = recent_stats) +
geom_bar(aes(x = country, y = records, fill = classification), stat = "identity") +
theme_minimal() +
coord_flip() +
scale_fill_manual(values = c("#e9933e", "#f3c654", "#cccccc")) +
ggtitle("Biodiversity records contributed to OBIS by country") +
scale_y_continuous(labels = scales::label_comma()) +
facet_wrap(~year)ggsave("output/graph_untransformed.png", width = 12, height = 7, dpi = 300, scale = 1.2, bg = "white")
ggplot(data = recent_stats) +
geom_bar(aes(x = country, y = records, fill = classification), stat = "identity") +
theme_minimal() +
coord_flip() +
scale_fill_manual(values = c("#e9933e", "#f3c654", "#cccccc")) +
ggtitle("Biodiversity records contributed to OBIS by country") +
scale_y_continuous(labels = scales::label_comma(), trans = "log10") +
facet_wrap(~year)Statistics
Records and datasets per year:
recent_datasets %>%
distinct(id, year, records) %>%
group_by(year) %>%
summarize(records = sum(records, na.rm = TRUE), datasets = n())## # A tibble: 4 × 3
## year records datasets
## <dbl> <int> <int>
## 1 2021 17326530 649
## 2 2022 14415793 394
## 3 2023 27496526 454
## 4 2024 5169415 270
Countries per year and classification:
recent_stats %>%
group_by(year, classification) %>%
summarize(records = sum(records, na.rm = TRUE), datasets = n(), countries = length(unique(country)))## # A tibble: 12 × 5
## # Groups: year [4]
## year classification records datasets countries
## <dbl> <fct> <int> <int> <int>
## 1 2021 SIDS 68960 4 4
## 2 2021 Africa 1982 3 3
## 3 2021 other 22061341 37 37
## 4 2022 SIDS 1054697 10 10
## 5 2022 Africa 252115 5 5
## 6 2022 other 19782979 42 42
## 7 2023 SIDS 87680 5 5
## 8 2023 Africa 24209 5 5
## 9 2023 other 35545204 32 32
## 10 2024 SIDS 411051 5 5
## 11 2024 Africa 11528 4 4
## 12 2024 other 6075528 35 35
All OBIS datasets so far
stats <- datasets %>%
distinct(id, records, year, country, sids, region_name, classification) %>%
filter(!is.na(country)) %>%
group_by(country, year, sids, region_name, classification) %>%
summarize(datasets = n(), records = sum(records)) %>%
arrange(desc(records))
stats %>%
group_by(country, classification) %>%
summarize(min_year = min(year)) %>%
group_by(classification, min_year) %>%
summarize(countries = n()) %>%
arrange(classification, !is.na(min_year), min_year) %>%
group_by(classification) %>%
mutate(cumulative_countries = cumsum(countries))## # A tibble: 14 × 4
## # Groups: classification [3]
## classification min_year countries cumulative_countries
## <fct> <dbl> <int> <int>
## 1 SIDS NA 13 13
## 2 SIDS 2019 1 14
## 3 SIDS 2020 3 17
## 4 SIDS 2021 1 18
## 5 SIDS 2022 3 21
## 6 Africa NA 14 14
## 7 Africa 2021 1 15
## 8 Africa 2022 1 16
## 9 other NA 53 53
## 10 other 2019 1 54
## 11 other 2020 2 56
## 12 other 2022 3 59
## 13 other 2023 1 60
## 14 other 2024 1 61
BioEco GeoNode monitoring programmes
layers <- jsonlite::fromJSON("https://geonode.goosocean.org/api/layers/")$objects
stopifnot(nrow(layers) < 1000)
regions <- unlist(layers$regions)Fix region names:
## [1] "Global" "Mediterranean Sea"
## [3] "Alaska" "California"
## [5] "Baltic Sea" "Queensland"
## [7] "Southern Ocean" "Turkey"
## [9] "Newfoundland and Labrador" "Faeroe Islands"
## [11] "United Kingdom" "Americas"
## [13] "Scotland" "Europe"
## [15] "North America" "Pacific"
## [17] "Baker Island" "Hawaii"
## [19] "Northwestern Hawaiian Islands" "Howland Island"
## [21] "Jarvis Islands" "Johnston Atoll"
## [23] "Kingman Reef" "Micronesia, Federated States of"
## [25] "Midway Islands" "Palmyra Atoll"
## [27] "Wake Island" "South America"
## [29] "Caribbean" "Asia"
## [31] "Middle East" "Cape Verde"
regions <- plyr::revalue(regions, c(
"United Kingdom" = "United Kingdom of Great Britain and Northern Ireland",
"Scotland" = "United Kingdom of Great Britain and Northern Ireland",
"Hawaii" = "United States of America",
"Alaska" = "United States of America",
"Queensland" = "Australia",
"Queensland" = "Australia",
"Newfoundland and Labrador" = "Canada",
"California" = "United States of America",
"Turkey" = "Türkiye",
"Micronesia, Federated States of" = "Micronesia (Federated States of)",
"Cape Verde" = "Cabo Verde",
"Faeroe Islands" = "Faroe Islands",
"Baker Island" = "United States of America",
"Northwestern Hawaiian Islands" = "United States of America",
"Howland Island" = "United States of America",
"Jarvis Islands" = "United States of America",
"Johnston Atoll" = "United States of America",
"Kingman Reef" = "United States of America",
"Micronesia, Federated States of" = "Micronesia (Federated States of)",
"Midway Islands" = "United States of America",
"Palmyra Atoll" = "United States of America",
"Wake Island" = "United States of America",
"Cape Verde" = "Cabo Verde"
))bioeco_countries <- countries %>%
filter(country_or_area %in% regions)
bioeco_countries %>%
group_by(classification) %>%
summarize(n())## # A tibble: 4 × 2
## classification `n()`
## <fct> <int>
## 1 SIDS 14
## 2 Africa 7
## 3 other 50
## 4 <NA> 1